Pre-analyses checks

Author

Laura Symul

Timing of visits

Code
# We load the MAE object
mae <- load_latest_mae(dir = str_c(data_dir(), "03_augmented_mae/")) 
clin <- MultiAssayExperiment::colData(mae) |> as.data.frame() |> as_tibble()
Code
tmp <- 
  clin |> 
  filter(!is.na(DAY)) |> 
  left_join(get_assay_wide_format(mae, "tax_16S_p", add_colData = FALSE) |> mutate(has_16S = TRUE), by = join_by(Barcode)) |> 
  filter(has_16S) |> 
  select(USUBJID, ARM, AVISITN, PIPV) 

bind_rows(
  tmp |> 
    select(USUBJID, ARM) |> 
    distinct() |> 
    dplyr::count(ARM) |> 
    mutate(Visit = "At least one visit"),
  tmp |> 
    filter(PIPV) |> 
    mutate(Visit = get_visit_labels(AVISITN)) |> 
    dplyr::count(ARM, Visit),
  tmp |> 
    filter(PIPV) |> 
    group_by(USUBJID) |> mutate(n = n()) |> ungroup() |> filter(n == max(n)) |> 
    select(USUBJID, ARM) |> 
    distinct() |> 
    dplyr::count(ARM) |> 
    mutate(Visit = "All planned in-person visits")
) |> 
  mutate(
    Visit = 
      Visit |> 
      factor(
        levels = 
          c(
            "At least one visit", 
            get_visit_labels(c(0:4, 7)) |> levels(), 
            "All planned in-person visits"
            )
        )
    ) |> 
  dplyr::rename(Arm = ARM) |> 
  pivot_wider(id_cols = Arm, names_from = Visit, values_from = n) |> 
  as.data.frame() |> column_to_rownames("Arm") |> 
  gt(
    rownames_to_stub = TRUE,
    caption = "Number of participants in each study arm (rows) with an available 16S rRNA sequencing data at any, each, or all planned in-person visits."
  ) |> 
  grand_summary_rows(
    fns =  list(label = "Total", id = "totals", fn = "sum"),
    fmt = ~ fmt_integer(.),
    side = "top"
  ) 
Number of participants in each study arm (rows) with an available 16S rRNA sequencing data at any, each, or all planned in-person visits.
At least one visit Pre-MTZ Post-MTZ Week 4 Week 8 Week 12 Week 24 All planned in-person visits
Total 211 208 203 181 172 179 165 139
LACTIN-V 142 140 138 124 120 123 113 99
Placebo 69 68 65 57 52 56 52 40
Code
g <- 
clin |> 
  filter(!is.na(DAY)) |> 
  left_join(get_assay_wide_format(mae, "tax_16S_p", add_colData = FALSE) |> mutate(has_16S = TRUE), by = join_by(Barcode)) |> 
  filter(has_16S) |> 
  ggplot() +
  aes(x = DAY, y = USUBJID, col = AVISITN |> floor() |> get_visit_labels(), shape = VISIT_TYPE) +
  geom_point() +
  facet_grid(ARM ~ ., scales = "free_y", space = "free_y") +
  scale_color_brewer("Visit", type = "qual", palette = 3, guide = guide_legend(direction = "vertical")) +
  scale_shape_manual("Visit type", values = c(16, 3), guide = guide_legend(direction = "vertical")) +
  theme(axis.text.y = element_blank(), legend.position = "right") +
  labs(
    x = "Study day",
    y = "Participants",
    subtitle = "Visits with 16S rRNA sequencing data"
  )
g 

Code
ggsave(g, filename = str_c(fig_out_dir(), "S1A.pdf"), width = 6, height = 8, device = cairo_pdf)
Code
clin |> 
  filter(AVISITN == 0, !is.na(DAY)) |> 
  ggplot(aes(x = DAY + 5)) + geom_histogram(binwidth = 1) +
  labs(
    title = "Screening visits timing distribution",
    x = "Day relative to first MTZ dose",
    y = "Number of participants"
  )

Code
# j <- which(mae$ARM == "Placebo")
# mae_Placebo <- mae[,j,]
# mae_Placebo <- mae_Placebo[,,c(3,9)]
# save(mae_Placebo, file = "mae_Placebo.RData")

Demographics characteristics

Code
mae <- load_latest_mae(str_c(data_dir(), "03_augmented_mae/"))
Loading mae_20250809.RDS
Code
clin <- MultiAssayExperiment::colData(mae) |> data.frame() |> as_tibble()
Code
race_and_edu <- 
  clin |> 
  filter(ARM %in% c("LACTIN-V", "Placebo"), !is.na(EDULVL)) |> 
  select(USUBJID, RACEGR1, RACEGR2, EDULVL) |> 
  distinct() |> 
  mutate(RACEGR2 = RACEGR2 |>  fct_relevel("Other", after = 1)) |> 
  arrange(RACEGR2) |> 
  mutate(RACEGR2 = RACEGR2 |> as.character() |> str_wrap(15) |> fct_inorder())


(Xsq <- chisq.test(table(race_and_edu$EDULVL, race_and_edu$RACEGR2)))
Warning in chisq.test(table(race_and_edu$EDULVL, race_and_edu$RACEGR2)):
Chi-squared approximation may be incorrect

    Pearson's Chi-squared test

data:  table(race_and_edu$EDULVL, race_and_edu$RACEGR2)
X-squared = 60.877, df = 8, p-value = 3.135e-10
Code
t_1 <- race_and_edu |> dplyr::count(RACEGR2, EDULVL) 

g_1 <- 
  ggplot(t_1, aes(x = RACEGR2, y = EDULVL, fill = RACEGR2)) +
  geom_tile(aes(alpha = n)) +
  geom_text(aes(label = n)) + 
  # scale_alpha(range = c(0,1)) +
  # scale_fill_gradient(low = "white", high = "steelblue") +
  xlab("Self-declared participants' race") + ylab("") +
  scale_fill_manual(values = c("purple1", "darkseagreen2", "gold3")) +
  guides(fill = "none", alpha = "none")

g_2 <- 
  race_and_edu |> 
  arrange(RACEGR2, RACEGR1) |>
  mutate(RACEGR1 = RACEGR1 |> str_wrap(12) |> fct_inorder()) |> 
  dplyr::count(RACEGR2, RACEGR1, EDULVL) |>
  ggplot(aes(x = RACEGR1, y = EDULVL, fill = RACEGR2)) +
  geom_tile(aes(alpha = n)) +
  geom_text(aes(label = n)) + 
  # scale_fill_gradient(low = "white", high = "steelblue") +
  xlab("Self-declared participants' race (detailed)") + ylab("") +
  scale_fill_manual(values = c("purple1", "darkseagreen2", "gold3")) +
  guides(fill = "none", alpha = "none")
Code
g_1 + g_2 + plot_layout(guides = "collect", widths = c(4, 8))

Checking for arm differences at baseline?

Code
mae <- load_latest_mae(str_c(data_dir(), "03_augmented_mae/"))
Loading mae_20250809.RDS
Code
clin <- MultiAssayExperiment::colData(mae) |> data.frame() |> as_tibble()

In this documents, we perform a series of check to ensure that the two groups (Lactin-V and placebo recipients) have similar characteristics at baseline.

For each of the domains (demographic variables, microbiota composition, cytokine/chemokine concentrations, behavioral variable), we check that balance for participants included for evaluating the primary outcome (= L. crispatus ≥ 50%) at week 12. We also repeat that analysis for participants with 16S data at week 24.

Code
week12_participants <- clin$USUBJID[!is.na(clin$AVISITN) & (clin$AVISITN == 4)]
week24_participants <- clin$USUBJID[!is.na(clin$AVISITN) & (clin$AVISITN == 7)]

Demographics (“Table 1”)

Code
dem_data <- 
  MultiAssayExperiment::colData(mae[,,"ASV_16S"]) |> 
  as_tibble() |> 
  select(USUBJID, ARM, AGE, RACEGR2, ETHNIC,  EDULVL, N_PAST_BV) |> 
  dplyr::rename(
    Age = AGE,
    `Self-declared race` = RACEGR2,
    Ethnicity = ETHNIC,
    Education = EDULVL,
    `Nb of past BV episodes` = N_PAST_BV
  )
Warning: 'experiments' dropped; see 'drops()'
harmonizing input:
  removing 27361 sampleMap rows not in names(experiments)
  removing 762 colData rownames not in sampleMap 'primary'
Code
dem_data |> 
  filter(USUBJID %in% week12_participants) |> 
  distinct() |> 
  select(-USUBJID) |> 
  tbl_summary(by = ARM) |> 
  add_p() |> 
  modify_caption(caption = "Demographic variables of participants with microbiota composition data at week 12")
Demographic variables of participants with microbiota composition data at week 12
Characteristic LACTIN-V
N = 1231
Placebo
N = 561
p-value2
Age 30 (25, 36) 32 (25, 37) 0.8
Self-declared race

0.7
    Black/African American 44 (36%) 20 (36%)
    White 52 (42%) 21 (38%)
    Other 27 (22%) 15 (27%)
Ethnicity

0.8
    NOT HISPANIC OR LATINO 98 (80%) 44 (79%)
    HISPANIC OR LATINO 25 (20%) 12 (21%)
    NOT REPORTED 0 (0%) 0 (0%)
    UNKNOWN 0 (0%) 0 (0%)
Education

0.6
    Did not complete high school 10 (8.1%) 1 (1.8%)
    Completed high school 46 (37%) 21 (38%)
    Completed junior college 19 (15%) 10 (18%)
    Completed college (undergraduate degree) 35 (28%) 19 (34%)
    Completed graduate degree 13 (11%) 5 (8.9%)
Nb of past BV episodes

0.8
    None 7 (5.7%) 4 (7.1%)
    1-2 27 (22%) 10 (18%)
    3-4 20 (16%) 7 (13%)
    5 or more 61 (50%) 29 (52%)
    Unknown 8 (6.5%) 6 (11%)
1 Median (Q1, Q3); n (%)
2 Wilcoxon rank sum test; Pearson’s Chi-squared test; Fisher’s exact test
Code
dem_data |> 
  filter(USUBJID %in% week24_participants) |> 
  distinct() |> 
  select(-USUBJID) |> 
  tbl_summary(by = ARM) |> 
  add_p() |> 
  modify_caption(caption = "Demographic variables of participants with microbiota composition data at week 24")
Demographic variables of participants with microbiota composition data at week 24
Characteristic LACTIN-V
N = 1131
Placebo
N = 521
p-value2
Age 30 (25, 36) 33 (26, 37) 0.6
Self-declared race

0.8
    Black/African American 39 (35%) 20 (38%)
    White 48 (42%) 19 (37%)
    Other 26 (23%) 13 (25%)
Ethnicity

0.7
    NOT HISPANIC OR LATINO 90 (80%) 40 (77%)
    HISPANIC OR LATINO 23 (20%) 12 (23%)
    NOT REPORTED 0 (0%) 0 (0%)
    UNKNOWN 0 (0%) 0 (0%)
Education

0.6
    Did not complete high school 9 (8.0%) 1 (1.9%)
    Completed high school 40 (35%) 21 (40%)
    Completed junior college 17 (15%) 10 (19%)
    Completed college (undergraduate degree) 34 (30%) 15 (29%)
    Completed graduate degree 13 (12%) 5 (9.6%)
Nb of past BV episodes

0.7
    None 6 (5.3%) 4 (7.7%)
    1-2 25 (22%) 9 (17%)
    3-4 17 (15%) 6 (12%)
    5 or more 57 (50%) 27 (52%)
    Unknown 8 (7.1%) 6 (12%)
1 Median (Q1, Q3); n (%)
2 Wilcoxon rank sum test; Pearson’s Chi-squared test; Fisher’s exact test

-omics at baseline (pre- and post-MTZ)

16S rRNA sequencing data

Species composition

Code
species <- MultiAssayExperiment::assay(mae, "tax_16S_p") %>% t()

We first compare the species composition at the first two visits (pre- and post-MTZ) for participants included for primary outcome at week 12.

Code
PCoA_baseline(
  mae = mae, assay_name = "tax_16S_p", 
  visit = 0, included_participants = week12_participants
  )

Code
PCoA_baseline(
  mae = mae, assay_name = "tax_16S_p", 
  visit = 1, included_participants = week12_participants
  )

And for participants with microbiota composition at week 24.

Code
PCoA_baseline(
  mae = mae, assay_name = "tax_16S_p", 
  visit = 0, included_participants = week24_participants
  )

Code
PCoA_baseline(
  mae = mae, assay_name = "tax_16S_p", 
  visit = 1, included_participants = week24_participants
  )

ASV composition

Code
set.seed(12345)

g1 <- 
PCoA_baseline(
  mae = mae, assay_name = "ASV_16S_filtered", 
  visit = 0, included_participants = week12_participants
)

g2 <- 
PCoA_baseline(
  mae = mae, assay_name = "ASV_16S_filtered", 
  visit = 1, included_participants = week12_participants
)

g1

Code
g2

Code
  ggsave(g1, filename = str_c(fig_out_dir(), "S1B.pdf"), width = 13, height = 4, device = cairo_pdf)
Code
  ggsave(g2, filename = str_c(fig_out_dir(), "S1C.pdf"), width = 13, height = 4, device = cairo_pdf)

And for participants with microbiota composition at week 24.

Code
set.seed(12345)

PCoA_baseline(
  mae = mae, assay_name = "ASV_16S_filtered", 
  visit = 0, included_participants = week24_participants
)

Code
PCoA_baseline(
  mae = mae, assay_name = "ASV_16S_filtered", 
  visit = 1, included_participants = week24_participants
)

Topic composition

Code
gammas <- 
  get_assay_long_format(
    mae, assayname = "c_topics_16S_8", add_rowData = TRUE,
    feature_name = "topic", values_name = "prop"
    )
Code
plot_topic_composition_1_visit(gammas, visit = 0) +
  labs(subtitle = "Screening visit")

Code
test_v0 <- test_topic_comp_by_ARM(gammas, visit = 0)
Warning in DirichletReg::DR_data(y): some entries are 0 or 1 => transformation
forced
Code
test_v0$full %>% summary()
Call:
DirichletReg::DirichReg(formula = y ~ ARM, data = data)

Standardized Residuals:
                             Min       1Q   Median       3Q     Max
L. crispatus             -0.5435  -0.5397  -0.5355  -0.5211  2.2653
L. iners                 -0.7271  -0.7036  -0.6163  -0.3139  5.2491
L. jensenii              -0.5046  -0.5046  -0.5030  -0.4941  4.1606
Other L.                 -0.4960  -0.4960  -0.4960  -0.4937  6.0511
G. s./l. topic           -1.3330  -0.7035   0.3135   1.1489  2.8858
P. amnii topic           -0.9743  -0.7961  -0.1684   0.8594  3.6323
Ca. L. v. (BVAB1) topic  -0.7046  -0.7024  -0.6555   1.6499  5.5613
P. bivia topic           -0.6826  -0.6615  -0.6096   0.1651  5.4358

------------------------------------------------------------------
Beta-Coefficients for variable no. 1: L. crispatus
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.58571    0.09307 -17.038   <2e-16 ***
ARMPlacebo  -0.01513    0.16572  -0.091    0.927    
------------------------------------------------------------------
Beta-Coefficients for variable no. 2: L. iners
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.06996    0.09207 -11.622   <2e-16 ***
ARMPlacebo   0.02326    0.16367   0.142    0.887    
------------------------------------------------------------------
Beta-Coefficients for variable no. 3: L. jensenii
             Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.730140   0.093267 -18.550   <2e-16 ***
ARMPlacebo   0.002702   0.166056   0.016    0.987    
------------------------------------------------------------------
Beta-Coefficients for variable no. 4: Other L.
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.76144    0.09331 -18.878   <2e-16 ***
ARMPlacebo   0.03275    0.16608   0.197    0.844    
------------------------------------------------------------------
Beta-Coefficients for variable no. 5: G. s./l. topic
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -0.30113    0.08981  -3.353   0.0008 ***
ARMPlacebo   0.23248    0.15809   1.471   0.1414    
------------------------------------------------------------------
Beta-Coefficients for variable no. 6: P. amnii topic
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -0.83048    0.09144  -9.082   <2e-16 ***
ARMPlacebo   0.29098    0.16127   1.804   0.0712 .  
------------------------------------------------------------------
Beta-Coefficients for variable no. 7: Ca. L. v. (BVAB1) topic
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.13225    0.09221 -12.278   <2e-16 ***
ARMPlacebo  -0.04403    0.16417  -0.268    0.789    
------------------------------------------------------------------
Beta-Coefficients for variable no. 8: P. bivia topic
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.24259    0.09246 -13.440   <2e-16 ***
ARMPlacebo   0.09966    0.16421   0.607    0.544    
------------------------------------------------------------------
Significance codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Log-likelihood: 3012 on 16 df (78 BFGS + 1 NR Iterations)
AIC: -5992, BIC: -5942
Number of Observations: 165
Link: Log
Parametrization: common
Code
anova(test_v0$full, test_v0$null)
Analysis of Deviance Table

Model 1: DirichletReg::DirichReg(formula = y ~ ARM, data = data)
Model 2: DirichletReg::DirichReg(formula = y ~ 1, data = data)

        Deviance N. par Difference df Pr(>Chi)
Model 1  -6023.6     16                       
Model 2  -6018.7      8      4.966  8   0.7612

No significant differences in microbiota composition at baseline when characterizing microbiota with topic composition.

Code
plot_topic_composition_1_visit(gammas, visit = 1) +
  labs(subtitle = "Randomization visit")

Code
test_v1 <- test_topic_comp_by_ARM(gammas, visit = 1)
Warning in DirichletReg::DR_data(y): some entries are 0 or 1 => transformation
forced
Code
test_v1$full |> summary()
Call:
DirichletReg::DirichReg(formula = y ~ ARM, data = data)

Standardized Residuals:
                             Min       1Q   Median       3Q     Max
L. crispatus             -0.5992  -0.5839  -0.5697  -0.5384  4.3991
L. iners                 -1.6329  -0.5201   0.5791   1.8365  2.6819
L. jensenii              -0.6164  -0.6148  -0.6011   0.1677  6.7281
Other L.                 -0.5461  -0.5461  -0.5448  -0.4932  6.2604
G. s./l. topic           -0.9899  -0.7721  -0.4348   0.6082  3.3974
P. amnii topic           -0.6527  -0.6037  -0.5801  -0.2980  3.4925
Ca. L. v. (BVAB1) topic  -0.6143  -0.6105  -0.5644  -0.3939  6.8855
P. bivia topic           -0.6929  -0.6492  -0.5593  -0.3589  4.1953

------------------------------------------------------------------
Beta-Coefficients for variable no. 1: L. crispatus
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.35436    0.09292 -14.576   <2e-16 ***
ARMPlacebo  -0.03243    0.16840  -0.193    0.847    
------------------------------------------------------------------
Beta-Coefficients for variable no. 2: L. iners
            Estimate Std. Error z value Pr(>|z|)  
(Intercept)  0.20237    0.08715   2.322   0.0202 *
ARMPlacebo   0.08262    0.15640   0.528   0.5973  
------------------------------------------------------------------
Beta-Coefficients for variable no. 3: L. jensenii
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -1.3033     0.0928 -14.043   <2e-16 ***
ARMPlacebo   -0.0160     0.1681  -0.095    0.924    
------------------------------------------------------------------
Beta-Coefficients for variable no. 4: Other L.
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.52298    0.09326 -16.330   <2e-16 ***
ARMPlacebo   0.02450    0.16889   0.145    0.885    
------------------------------------------------------------------
Beta-Coefficients for variable no. 5: G. s./l. topic
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -0.79417    0.09131  -8.698   <2e-16 ***
ARMPlacebo   0.34307    0.16333   2.101   0.0357 *  
------------------------------------------------------------------
Beta-Coefficients for variable no. 6: P. amnii topic
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.34068    0.09289 -14.433   <2e-16 ***
ARMPlacebo   0.17043    0.16768   1.016    0.309    
------------------------------------------------------------------
Beta-Coefficients for variable no. 7: Ca. L. v. (BVAB1) topic
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.30922    0.09282 -14.105   <2e-16 ***
ARMPlacebo  -0.12367    0.16848  -0.734    0.463    
------------------------------------------------------------------
Beta-Coefficients for variable no. 8: P. bivia topic
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -1.1811     0.0925 -12.768   <2e-16 ***
ARMPlacebo    0.1178     0.1671   0.705    0.481    
------------------------------------------------------------------
Significance codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Log-likelihood: 2703 on 16 df (78 BFGS + 1 NR Iterations)
AIC: -5374, BIC: -5324
Number of Observations: 161
Link: Log
Parametrization: common
Code
anova(test_v1$full, test_v1$null)
Analysis of Deviance Table

Model 1: DirichletReg::DirichReg(formula = y ~ ARM, data = data)
Model 2: DirichletReg::DirichReg(formula = y ~ 1, data = data)

        Deviance N. par Difference df Pr(>Chi)
Model 1  -5405.7     16                       
Model 2  -5399.4      8     6.2048  8   0.6243
Code
gammas |> 
  filter(AVISITN %in% 0:1, EOSSTT == "COMPLETED", !is.na(ARM)) |> 
  ggplot() +
  aes(x = topic_label, y = prop, fill = ARM) +
  geom_boxplot(outlier.size = 0.5) +
  facet_grid(. ~ ifelse(AVISITN == 0, "Screening (pre-MTZ)", "Randomization (post-MTZ)") |> fct_inorder()) +
  scale_fill_manual("", values = get_arm_colors()) +
  theme(
    strip.background = element_rect(fill = "gray", color = NA),
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "top"
    ) +
  xlab("Topic")

At the randomization visit, there are slightly more participants with subcommunity IV-B in the Placebo arm than in the Lactin-V arm.

CST distribution

Code
CSTs <- 
  get_assay_wide_format(mae, "CST") |>  
  unnest(cols = c(assay)) |> 
  mutate(Visit = str_c("Visit ", AVISITN))

For participants with microbiota data at week 12

Code
week12_CSTs <- 
  CSTs |> filter(AVISITN %in% 0:1, USUBJID %in% week12_participants)

 week12_CSTs |> 
  ggplot(aes(x = subCST, fill = CST)) +
  geom_bar() +
  facet_grid(ARM ~ Visit, scales = "free_y") +
  ylab("Nb of participants") +
   scale_fill_manual(values = get_topic_colors(week12_CSTs$CST |> unique() |> sort()))

Code
week12_CSTs_V0 <- 
  week12_CSTs |> 
  filter(AVISITN == 0) |>  
  select(subCST, ARM)

(
  V0_chisq <- 
    chisq.test(table(week12_CSTs_V0), simulate.p.value = TRUE)
)

    Pearson's Chi-squared test with simulated p-value (based on 2000
    replicates)

data:  table(week12_CSTs_V0)
X-squared = 5.7833, df = NA, p-value = 0.4878
Code
week12_CSTs_V1 <- 
  week12_CSTs |> 
  filter(AVISITN == 1) |>  
  select(subCST, ARM)

(
  V1_chisq <- 
    chisq.test(table(week12_CSTs_V1), simulate.p.value = TRUE)
)

    Pearson's Chi-squared test with simulated p-value (based on 2000
    replicates)

data:  table(week12_CSTs_V1)
X-squared = 13.312, df = NA, p-value = 0.02449

For participants with microbiota data at week 24

Code
week24_CSTs <- 
  CSTs |> filter(AVISITN %in% 0:1, USUBJID %in% week24_participants)

 week24_CSTs |> 
  ggplot(aes(x = subCST, fill = CST)) +
  geom_bar() +
  facet_grid(ARM ~ Visit, scales = "free_y") +
  ylab("Nb of participants") +
   scale_fill_manual(values = get_topic_colors(week12_CSTs$CST |> unique() |> sort()))

Code
week24_CSTs_V0 <- 
  week24_CSTs |> 
  filter(AVISITN == 0) |>  
  select(subCST, ARM)

(
  V0_chisq <- 
    chisq.test(table(week24_CSTs_V0), simulate.p.value = TRUE)
)

    Pearson's Chi-squared test with simulated p-value (based on 2000
    replicates)

data:  table(week24_CSTs_V0)
X-squared = 4.815, df = NA, p-value = 0.6167
Code
week24_CSTs_V1 <- 
  week24_CSTs |> 
  filter(AVISITN == 1) |>  
  select(subCST, ARM)

(
  V1_chisq <- 
    chisq.test(table(week24_CSTs_V1), simulate.p.value = TRUE)
)

    Pearson's Chi-squared test with simulated p-value (based on 2000
    replicates)

data:  table(week24_CSTs_V1)
X-squared = 13.215, df = NA, p-value = 0.03248

No differences at baseline, but some differences post-MTZ, at randomization.

Microbiota composition at baseline (pre-MTZ) by demographic characteristics

Code
library(tidyverse)
library(patchwork)
library(gtsummary)
library(magrittr)

tmp <- fs::dir_map("R", source)
tmp <- fs::dir_map("../../R/", source)

theme_set(theme_publication())
Code
mae <- load_latest_mae(str_c(data_dir(), "03_augmented_mae/"))
Loading mae_20250809.RDS
Code
clin <- MultiAssayExperiment::colData(mae) |> data.frame() |> as_tibble()

By self-declared race

Code
assay_name <- "tax_16S_p"

mb_wide <- get_assay_wide_format(mae, assay_name)
mb_wide <- mb_wide |> mutate(assay = assay |> as.data.frame() |> set_rownames(Barcode))
mb_wide_v0 <- mb_wide |> filter(AVISITN == 0, !is.na(ARM))

distance <- "bray"

comp_data <- mb_wide_v0$assay

BC_dist <- vegan::vegdist(comp_data, method = distance)
mds <- cmdscale(BC_dist, k = 4, eig = TRUE)


g_eig <-
  ggplot(tibble(k = 1:20, eig = mds$eig[1:20]), aes(x = k, y = eig)) +
  geom_bar(stat = "identity") +
  labs(title = "Top eigenvalues", subtitle = str_c('MDS using "',distance,'" on ', assay_name, ' data'))

mds_df <-
    tibble(
      PCo1 = mds$points[, 1],
      PCo2 = mds$points[, 2],
      PCo3 = mds$points[, 3],
      PCo4 = mds$points[, 4],
      Barcode = rownames(mds$points)
    ) |> 
    left_join(mb_wide_v0, by = "Barcode")

fit_ARM <- vegan::adonis2(comp_data ~ ARM, data = mb_wide_v0, permutations=999, method=distance)
fit_RACE <- vegan::adonis2(comp_data ~ RACEGR2, data = mb_wide_v0, permutations=999, method=distance)

fit_SITE <- vegan::adonis2(comp_data ~ ARM + RACEGR2 + SITENAME, data = mb_wide_v0, permutations=999, method=distance)


  g_arms <-
    ggplot(mds_df, aes(x = PCo1, y = PCo2, col = ARM)) +
    geom_point() +
    coord_fixed() +
    theme(legend.position = "right") +
    annotate(
      "label", x = Inf, y = Inf, hjust = 1, vjust = 1,
      label = str_c("p: ", (fit_ARM$`Pr(>F)`[1]) %>% round(2))
    ) +
    scale_color_manual("", values = get_arm_colors(), guide = guide_legend(direction = "vertical")) +
    ggtitle("PCoA by study arm")
  
  
  g_race <-
    ggplot(mds_df, aes(x = PCo1, y = PCo2, col = RACEGR2)) +
    geom_point() +
    coord_fixed() +
    theme(legend.position = "right") +
    annotate(
      "label", x = Inf, y = Inf, hjust = 1, vjust = 1,
      label = str_c("p: ", (fit_RACE$`Pr(>F)`[1]) %>% round(6))
    ) +
    scale_color_manual(
      "Self-declared race", values = c("purple1", "darkseagreen2", "gold3"), 
      guide = guide_legend(direction = "vertical")
    ) +
    ggtitle("PCoA by participants' self-declared race")
  
  
  g_site <-
    ggplot(mds_df, aes(x = PCo1, y = PCo2, col = SITENAME)) +
    geom_point() +
    coord_fixed() +
    theme(legend.position = "right") +
    annotate(
      "label", x = Inf, y = Inf, hjust = 1, vjust = 1,
      label = str_c("p: ", (fit_SITE$`Pr(>F)`[1]) |> round(3))
    ) +
    scale_color_brewer(
      "Study site", type = "qual", palette = 7,
      guide = guide_legend(direction = "vertical")
    ) +
    ggtitle("PCoA by study site")
Code
g_arms + 
  g_race +
  plot_spacer() +
  g_site +
  
  plot_layout(ncol = 2, widths = c(1, 1.2))

Code
df_load <- clin |> select(USUBJID, AVISITN, LOAD, RACEGR2) |> filter(AVISITN == 1)

g_load <- 
  df_load |> filter(!is.na(LOAD)) |> 
  ggplot() +
  aes(x = RACEGR2, y = log10(LOAD + 1), fill = RACEGR2) +
  geom_boxplot(alpha = 0.5, outlier.shape = NULL) +
  geom_point(alpha = 0.5) +
  labs(
    x = "Self-declared race",
    y = "log10(post-MTZ total bacterial load + 1)"
  ) +
  scale_fill_manual(
    "Self-declared race", values = c("purple1", "darkseagreen2", "gold3")
  ) +
  guides(fill = "none")

df_shannon_v0 <-
  mb_wide_v0 |>
  mutate(
    shannon_v0 = assay |>  vegan::diversity(index = "shannon")
  ) |> 
  select(USUBJID, shannon_v0)

df_load <- df_load |> left_join(df_shannon_v0, by = join_by(USUBJID)) 

g_shannon <- 
  df_load |> filter(!is.na(shannon_v0)) |> 
  ggplot() +
  aes(x = RACEGR2, y = shannon_v0, fill = RACEGR2) +
   geom_boxplot(alpha = 0.5, outlier.shape = NULL) +
  geom_point(alpha = 0.5) +
  labs(
    x = "Self-declared race",
    y = "pre-MTZ α-diversity\n(Shannon index on taxa proportions)"
  ) +
  scale_fill_manual(
    "Self-declared race", values = c("purple1", "darkseagreen2", "gold3")
  ) +
  guides(fill = "none")

g_lm <- 
  df_load |> 
  filter(!is.na(LOAD), !is.na(shannon_v0)) |> 
  ggplot(aes(x = shannon_v0, y = log10(LOAD + 1))) +
  geom_point() +
  geom_smooth(method = "lm", formula = 'y ~ x') +
  labs(
    x = "pre-MTZ α-diversity\n(Shannon index on taxa proportions)",
    y = "log10(post-MTZ total bacterial load + 1)"
  )
Code
g_load + g_shannon + g_lm 

Summarizing figure

Code
race_and_edu <- 
  mae@colData |>
  as_tibble() |> 
  filter(ARM %in% c("LACTIN-V", "Placebo"), !is.na(EDULVL)) |> 
  select(USUBJID, RACEGR1, RACEGR2, EDULVL) |> 
  distinct() |> 
  mutate(RACEGR2 = RACEGR2 |>  fct_relevel("Other", after = 1)) |> 
  arrange(RACEGR2) |> 
  mutate(RACEGR2 = RACEGR2 |> as.character() |> str_wrap(15) |> fct_inorder())


(Xsq <- chisq.test(table(race_and_edu$EDULVL, race_and_edu$RACEGR2), simulate.p.value = TRUE))

    Pearson's Chi-squared test with simulated p-value (based on 2000
    replicates)

data:  table(race_and_edu$EDULVL, race_and_edu$RACEGR2)
X-squared = 60.877, df = NA, p-value = 0.0004998
Code
t_1 <- race_and_edu |> dplyr::count(RACEGR2, EDULVL) 

g_1 <- 
  ggplot(t_1, aes(x = RACEGR2, y = EDULVL, fill = RACEGR2)) +
  geom_tile(aes(alpha = n)) +
  geom_text(aes(label = n)) + 
  # scale_alpha(range = c(0,1)) +
  # scale_fill_gradient(low = "white", high = "steelblue") +
  xlab("Self-declared participants' race") + ylab("Education") +
  scale_fill_manual(values = c("purple1", "darkseagreen3", "gold3")) +
  guides(fill = "none", alpha = "none") +
  theme(
    axis.title.y = element_text(angle = 0, hjust = 1, vjust = 1, margin = margin(r = -80))
  )
Code
df_shannon_v0 <-
  get_assay_wide_format(mae, "tax_16S_p", add_colData = FALSE) |> 
  mutate(
    shannon_v0 = assay |>  vegan::diversity(index = "shannon")
  ) |> 
  select(Barcode, shannon_v0) |> 
  left_join(
    mae@colData |> 
      as_tibble() |> 
      select(Barcode, USUBJID, ARM, AVISITN, RACEGR2) |> 
      distinct(), 
    by = join_by(Barcode)
    ) |> 
  filter(AVISITN == 0, !is.na(ARM))

lm(shannon_v0 ~ RACEGR2, data = df_shannon_v0 |> mutate(RACEGR2 = RACEGR2 |> fct_relevel("White"))) |> summary()

Call:
lm(formula = shannon_v0 ~ RACEGR2, data = mutate(df_shannon_v0, 
    RACEGR2 = fct_relevel(RACEGR2, "White")))

Residuals:
    Min      1Q  Median      3Q     Max 
-1.9720 -0.1511  0.1097  0.3559  0.9980 

Coefficients:
                              Estimate Std. Error t value Pr(>|t|)    
(Intercept)                    1.95844    0.06149  31.851   <2e-16 ***
RACEGR2Black/African American  0.18528    0.08780   2.110    0.036 *  
RACEGR2Other                   0.14733    0.09855   1.495    0.136    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.55 on 205 degrees of freedom
Multiple R-squared:  0.02315,   Adjusted R-squared:  0.01362 
F-statistic:  2.43 on 2 and 205 DF,  p-value: 0.09061
Code
g_shannon <- 
  df_shannon_v0 |> 
  ggplot() +
  aes(x = RACEGR2, y = shannon_v0, fill = RACEGR2) +
  # geom_boxplot(alpha = 0.5, outlier.shape = NULL) +
  geom_violin(alpha = 0.5, draw_quantiles = 0.5, color = "white", linewidth = 1) +
  # geom_point(alpha = 0.5) +
  ggbeeswarm::geom_quasirandom(width = 0.3, size = 0.5, aes(col = RACEGR2)) +
  labs(
    x = "Self-declared race",
    y = "pre-MTZ α-diversity\n(Shannon index on taxa proportions)"
  ) +
  scale_fill_manual(
    "Self-declared race", values = c("purple1", "darkseagreen3", "gold3")
  ) +
  scale_color_manual(
    "Self-declared race", values = c("purple1", "darkseagreen3", "gold3")
  ) +
  guides(fill = "none", col = "none")
Code
i <- 2

g <- 
g_1 + labs(tag = LETTERS[i]) + 
  g_shannon + labs(tag = LETTERS[i + 1])

g

Code
ggsave(g, filename = str_c(fig_out_dir(), "S7BC.pdf"), width = 13, height = 4, device = cairo_pdf)

Session Info

Code
sessionInfo()
R version 4.4.2 (2024-10-31)
Platform: aarch64-apple-darwin20
Running under: macOS Sequoia 15.6

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: Europe/Brussels
tzcode source: internal

attached base packages:
[1] grid      stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] gtsummary_2.2.0 phyloseq_1.50.0 ggthemes_5.1.0  ggrepel_0.9.6  
 [5] gt_1.0.0        patchwork_1.3.0 magrittr_2.0.3  lubridate_1.9.4
 [9] forcats_1.0.0   stringr_1.5.1   dplyr_1.1.4     purrr_1.0.4    
[13] readr_2.1.5     tidyr_1.3.1     tibble_3.2.1    ggplot2_3.5.2  
[17] tidyverse_2.0.0

loaded via a namespace (and not attached):
  [1] sandwich_3.1-1              DirichletReg_0.7-2         
  [3] permute_0.9-7               rlang_1.1.6                
  [5] ade4_1.7-23                 matrixStats_1.5.0          
  [7] compiler_4.4.2              mgcv_1.9-1                 
  [9] vctrs_0.6.5                 reshape2_1.4.4             
 [11] pkgconfig_2.0.3             crayon_1.5.3               
 [13] fastmap_1.2.0               backports_1.5.0            
 [15] XVector_0.46.0              labeling_0.4.3             
 [17] rmarkdown_2.29              markdown_2.0               
 [19] tzdb_0.5.0                  ggbeeswarm_0.7.2           
 [21] UCSC.utils_1.2.0            miscTools_0.6-28           
 [23] xfun_0.52                   MultiAssayExperiment_1.32.0
 [25] zlibbioc_1.52.0             litedown_0.7               
 [27] GenomeInfoDb_1.42.3         jsonlite_2.0.0             
 [29] biomformat_1.34.0           rhdf5filters_1.18.1        
 [31] DelayedArray_0.32.0         Rhdf5lib_1.28.0            
 [33] broom_1.0.8                 parallel_4.4.2             
 [35] cluster_2.1.8.1             R6_2.6.1                   
 [37] stringi_1.8.7               RColorBrewer_1.1-3         
 [39] GenomicRanges_1.58.0        Rcpp_1.0.14                
 [41] SummarizedExperiment_1.36.0 iterators_1.0.14           
 [43] knitr_1.50                  zoo_1.8-14                 
 [45] base64enc_0.1-3             IRanges_2.40.1             
 [47] BiocBaseUtils_1.8.0         Matrix_1.7-3               
 [49] splines_4.4.2               igraph_2.1.4               
 [51] timechange_0.3.0            tidyselect_1.2.1           
 [53] rstudioapi_0.17.1           abind_1.4-8                
 [55] yaml_2.3.10                 vegan_2.6-10               
 [57] maxLik_1.5-2.1              codetools_0.2-20           
 [59] lattice_0.22-6              plyr_1.8.9                 
 [61] Biobase_2.66.0              withr_3.0.2                
 [63] evaluate_1.0.3              survival_3.8-3             
 [65] xml2_1.3.8                  Biostrings_2.74.1          
 [67] pillar_1.10.2               MatrixGenerics_1.18.1      
 [69] foreach_1.5.2               stats4_4.4.2               
 [71] generics_0.1.4              S4Vectors_0.44.0           
 [73] hms_1.1.3                   commonmark_1.9.5           
 [75] scales_1.4.0                glue_1.8.0                 
 [77] tools_4.4.2                 data.table_1.17.4          
 [79] fs_1.6.6                    rhdf5_2.50.2               
 [81] ape_5.8-1                   cards_0.6.0                
 [83] colorspace_2.1-1            nlme_3.1-168               
 [85] GenomeInfoDbData_1.2.13     beeswarm_0.4.0             
 [87] vipor_0.4.7                 cardx_0.2.4                
 [89] Formula_1.2-5               cli_3.6.5                  
 [91] S4Arrays_1.6.0              gtable_0.3.6               
 [93] sass_0.4.10                 digest_0.6.37              
 [95] BiocGenerics_0.52.0         SparseArray_1.6.2          
 [97] htmlwidgets_1.6.4           farver_2.1.2               
 [99] htmltools_0.5.8.1           multtest_2.62.0            
[101] lifecycle_1.0.4             httr_1.4.7                 
[103] MASS_7.3-65